/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.quality; import java.io.*; import java.util.*; import java.util.logging.*; import net.nutch.util.*; import net.nutch.searcher.*; /***************************************************** * This class modifies search parameters hill-climbing * style, trying to find the best set of values. It uses * the results of a QualityTestTool run as a test set. * (That will include both a query term list and a * set of results from other search engines.) * Since we're dynamically adjusting params, we need * to query a live Nutch searcher, so we'll need to * be provided the segment directories. * * @author Mike Cafarella *****************************************************/ public class SearchOptimizer { static final Logger LOG = LogFormatter.getLogger("net.nutch.quality.SearchOptimizer"); private final static int NUM_RESTARTS = 10; final static String QUERY_LIST = "queryList.txt"; final static String URL_INSET_SUFFIX = ".urlInset"; final static String QUERY_RESULTS_SUFFIX = ".queryResults"; final static String ENGINE_DESC_SUFFIX = ".src"; PageExtractor.NutchExtractor nutch; Random rand; Vector queryTerms; File segmentsDir; File inputsDir; TreeMap remoteExtractors = new TreeMap(); // 'engineResults' is a Map of engine name to a Map of // Query to Ordered-Result-List TreeMap engineResults = new TreeMap(); // 'insetTables' and 'outsetTables' are Maps to Sets. // Each Set contains the appropriate URLs. TreeMap insetTables = new TreeMap(), outsetTables = new TreeMap(); static float[] STEP_SIZE = {0.01f, 0.05f, 0.1f}; /** * Give the dirs where all the segments can be found, plus * the dir for existing QualityTestTool results. */ public SearchOptimizer(File segmentsDir, File inputsDir, String userAgent, Random rand) throws Exception { this.segmentsDir = segmentsDir; this.inputsDir = inputsDir; this.rand = rand; this.nutch = new PageExtractor.NutchExtractor(segmentsDir.getPath()); // // Now, load in everything from the existing QualityTestTool // directory. This includes: // 1. Query-term list // 2. Results from other engines // // // 1. Query-terms // File queryList = new File(inputsDir, QUERY_LIST); BufferedReader in = new BufferedReader(new FileReader(queryList)); this.queryTerms = new Vector(); try { String term = in.readLine(); while (term != null) { queryTerms.add(term.trim()); term = in.readLine(); } } finally { in.close(); } // // 2. Other engines' results. // File contents[] = inputsDir.listFiles(); for (int i = 0; i < contents.length; i++) { String filename = contents[i].getName(); if (filename.endsWith(URL_INSET_SUFFIX)) { // // First, load whether engines found URLs to // be in or out of set. If a URL is not found // in either table, in means that we have never // asked the engine for that URL. // // Compute the engine name int suffixStart = filename.lastIndexOf(URL_INSET_SUFFIX); String engineName = filename.substring(0, suffixStart); if (! engineName.startsWith("Nutch")) { Set insetURLs = new HashSet(); Set outsetURLs = new HashSet(); insetTables.put(engineName, insetURLs); outsetTables.put(engineName, outsetURLs); // Load in the engine's url-inset list int curInsetScore = 0; DataInputStream din = new DataInputStream(new FileInputStream(contents[i])); try { int numItems = din.readInt(); // Load in whether each URL was in-set or not. for (int j = 0; j < numItems; j++) { String url = din.readUTF(); if (din.readBoolean()) { insetURLs.add(url); } else { outsetURLs.add(url); } } } finally { din.close(); } } } else if (filename.endsWith(QUERY_RESULTS_SUFFIX)) { // // Second, load the actual returned-values that each // engine gave in response to a query. // // Compute the engine name int suffixStart = filename.lastIndexOf(QUERY_RESULTS_SUFFIX); String engineName = filename.substring(0, suffixStart); if (! engineName.startsWith("Nutch")) { // Load in results DataInputStream din = new DataInputStream(new FileInputStream(contents[i])); try { TreeMap resultLists = new TreeMap(); int numQueries = din.readInt(); for (int j = 0; j < numQueries; j++) { String query = din.readUTF(); int numResults = din.readInt(); String resultList[] = new String[numResults]; for (int k = 0; k < numResults; k++) { resultList[k] = din.readUTF(); } resultLists.put(query, resultList); } engineResults.put(engineName, resultLists); } finally { din.close(); } } } else if (filename.endsWith(ENGINE_DESC_SUFFIX)) { // Third, build up the remote-extractor db int suffixStart = filename.lastIndexOf(ENGINE_DESC_SUFFIX); String engineName = filename.substring(0, suffixStart); PageExtractor.RemotePageExtractor extractor = new PageExtractor.RemotePageExtractor(contents[i], userAgent, false); remoteExtractors.put(engineName, extractor); } } } /** * Run Nutch against the test query set, and compute a * score. We try to adjust search params so that this * score is maximized. */ double testNutch() { Map nutchResults = new TreeMap(); // // Loop through all the query terms, and ask Nutch for // responses. Record them all. // for (Enumeration e = queryTerms.elements(); e.hasMoreElements(); ) { String query = (String) e.nextElement(); Object results[] = null; try { results = nutch.applyQuery(query).toArray(); nutchResults.put(query, results); } catch (IOException ex) { continue; } } // // Iterate through all of Nutch's responses to make sure // we have tested for the URLs' presence in other engines. // String sampleKey = (String) insetTables.firstKey(); Set testInset = (Set) insetTables.get(sampleKey); Set testOutset = (Set) outsetTables.get(sampleKey); for (Iterator it = nutchResults.keySet().iterator(); it.hasNext(); ) { String query = (String) it.next(); Object results[] = (Object[]) nutchResults.get(query); for (int i = 0; i < results.length; i++) { if (!testInset.contains(results[i]) && !testOutset.contains(results[i])) { performInsetTest((String) results[i]); } } } // // Go through all the queries and compute a score for // the Nutch results. // int orderingScore = 0, normalizer = 0; for (Enumeration e = queryTerms.elements(); e.hasMoreElements(); ) { String query = (String) e.nextElement(); Object results[] = (Object[]) nutchResults.get(query); for (int i = 0; i < results.length; i++) { String result1 = (String) results[i]; for (int j = i+1; j < results.length; j++) { String result2 = (String) results[j]; int numVoters = countStatements(query, result1, result2); int term1Votes = countVotes(query, result1, result2); if ((numVoters > 1) && (term1Votes != (numVoters / 2))) { if (term1Votes > (numVoters / 2.0)) { orderingScore++; } normalizer++; } } } } return (orderingScore / (normalizer * 1.0)); } /** * countStatements() returns how many engines make some kind * of statement about the query and result-pair given. */ int countStatements(String query, String url1, String url2) { int engineCount = 0; // Iterate through all non-Nutch engines for (Iterator it = engineResults.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); if (! engineName.startsWith("Nutch")) { // The engine must have both items in-set if (inset(engineName, url1) && inset(engineName, url2)) { // Now make sure at least one is in the top-10 Map resultLists = (Map) engineResults.get(engineName); String results[] = (String[]) resultLists.get(query); for (int i = 0; i < results.length; i++) { if (results[i].equals(url1) || results[i].equals(url2)) { engineCount++; } } } } } return engineCount; } /** * countVotes() returns how many votes are cast for the * given ordering of terms (in response to the query). */ int countVotes(String query, String url1, String url2) { int url1Votes = 0; // Iterate through all non-Nutch engines for (Iterator it = engineResults.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); if (! engineName.startsWith("Nutch")) { // The engine must have both items in-set if (inset(engineName, url1) && inset(engineName, url2)) { // Now make sure at least one is in the top-10 Map resultLists = (Map) engineResults.get(engineName); String results[] = (String[]) resultLists.get(query); int url1Pos = Integer.MAX_VALUE, url2Pos = Integer.MAX_VALUE; for (int i = 0; i < results.length; i++) { if (results[i].equals(url1)) { url1Pos = i; } if (results[i].equals(url2)) { url2Pos = i; } } if ((url1Pos < Integer.MAX_VALUE || url2Pos < Integer.MAX_VALUE) && (url1Pos < url2Pos)) { url1Votes++; } } } } return url1Votes; } /** * Return whether the given engine has indexed the url. * Every URL should be known to be in-set or out-set by * the time we reach this point. We will have to contact * live search engines during optimization, because modifying * Nutch parameters might reveal novel URLs. * * However, we should have already found those URLs and * made the necessary modifications to "insetURLs" and "outsetURLs" * by the time we get here. */ boolean inset(String engineName, String url) { Set insetURLs = (Set) insetTables.get(engineName); if (insetURLs.contains(url)) { return true; } Set outsetURLs = (Set) outsetTables.get(engineName); if (outsetURLs.contains(url)) { return false; } throw new IllegalArgumentException("For engine " + engineName + ", a URL is always in-set or out-set."); } /** * Our inset records are lacking a URL. Test all the remote * engines and find out whether it's inset or not. * * REMIND - mjc * I know, I know, we should be writing down the results of * this test for future runs. This is coming soon. */ void performInsetTest(String query) { // // For now, we just add any unknown URL to the "out-of-set" list // for (Iterator it = engineResults.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); Set outsetURLs = (Set) outsetTables.get(engineName); outsetURLs.add(query); } // // Ask every remote service to test for the presence of the URL. // for (Iterator it = remoteExtractors.keySet().iterator(); it.hasNext(); ) { String engineName = (String) it.next(); PageExtractor.RemotePageExtractor remoteExtractor = (PageExtractor.RemotePageExtractor) remoteExtractors.get(engineName); boolean inSet = false; ArrayList results = null; try { results = remoteExtractor.applyQuery(query); } catch (IOException ie) { // Count query as out-of-set LOG.info("Could not contact " + engineName + " to test " + query); } if (results != null) { for (Iterator it2 = results.iterator(); it2.hasNext(); ) { String val = (String) it2.next(); if (val.trim().compareTo(query) == 0) { inSet = true; break; } } } if (inSet) { Set insetURLs = (Set) insetTables.get(engineName); insetURLs.add(query); System.err.println("Engine " + engineName + " has " + query); } else { Set outsetURLs = (Set) outsetTables.get(engineName); outsetURLs.add(query); System.err.println("Engine " + engineName + " lacks " + query); } } } /********************************************************* * Describes the current parameter settings and how well * they perform. *********************************************************/ class ParameterLocation { float scoreParams[] = new float[3]; float phraseParam; double score; /** * Initialize the ParameterLocation with standard * start-point values. */ public ParameterLocation() { // We have three scores to balance: url, anchor, and content. // It's best to think of these as summing to 1, so that // during our work we realize adjusting one value // means we also adjust the others. Later we // convert them into a form that QueryTranslator wants. // PhraseBoost is a value between 0 and 1. Its counterpart, // UnorderedBoost is equal to (1 - PhraseBoost). randomizePosition(); } /** * Inititalize with given values */ public ParameterLocation(float scoreParams[], float phraseParam) { System.arraycopy(scoreParams, 0, this.scoreParams, 0, this.scoreParams.length); this.phraseParam = phraseParam; evaluate(); } /** */ public void getScoreParam(float newScoreParams[]) { System.arraycopy(scoreParams, 0, newScoreParams, 0, scoreParams.length); } /** */ public float getPhraseParam() { return phraseParam; } /** */ public double getScore() { return score; } /** * This is used for random restarts of the parameter-space * search. */ public void randomizePosition() { scoreParams[0] = rand.nextFloat(); scoreParams[1] = (rand.nextFloat() * (1 - scoreParams[0])); scoreParams[2] = 1 - scoreParams[0] - scoreParams[1]; this.phraseParam = rand.nextFloat(); evaluate(); } /** * Finds the score for this parameter setting. Need * to convert these values into the style that * QueryTranslator wants. */ private void evaluate() { QueryTranslator.setUrlBoost(scoreParams[0] / scoreParams[2]); QueryTranslator.setAnchorBoost(scoreParams[1] / scoreParams[2]); QueryTranslator.setPhraseBoost(phraseParam / (1 - phraseParam)); this.score = testNutch(); } /** */ public String toString() { return "PL " + "url:" + scoreParams[0] + ", anchor:" + scoreParams[1] + ", phrase:" + phraseParam + ", score:" + score; } } /** * Find the best possible setting for search parameters, * using hillclimbing. When we have reached a local * maximum, we do a random restart. We always remember * the best settings. */ public void optimizeParams() { float startingPoint[] = {1.0f/3f, 1.0f/3f, 1.0f/3f}; ParameterLocation curLoc = new ParameterLocation(startingPoint, 0.5f); ParameterLocation newLoc = null, bestLoc = null; // // We want several randomized restarts of hillclimbing // so we avoid getting trapped in local maxima. // LOG.info("Now starting search-parameter optimization"); for (int restart = 0; restart < NUM_RESTARTS; restart++) { LOG.info("Parameter-hillclimb, starting from " + curLoc); // // Take repeated best-steps until we hit a local maximum // while ((newLoc = takeBestStep(curLoc)).getScore() > curLoc.getScore()) { curLoc = newLoc; LOG.info("Ascended to " + curLoc); } // // Remember this one if it's the best we've seen // if ((bestLoc == null) || (curLoc.getScore() > bestLoc.getScore())) { bestLoc = curLoc; LOG.info("New best setting is " + bestLoc); } else { LOG.info("Best setting is still " + bestLoc); } // // Randomize for the next step, if there is one. // if (restart + 1 < NUM_RESTARTS) { LOG.info("Randomized parameters and restarting..."); curLoc = new ParameterLocation(); } } LOG.info("Found best parameter settings: " + bestLoc); } /** * Go through a number of loops that iterate through * all reasonable parameter steps from this point. * In the innermost loop, test Nutch's performance, * and remember the best score seen so far. * * Before a loop begins, be sure to restore its * values. */ private ParameterLocation takeBestStep(ParameterLocation startingPoint) { float scoreParam[] = new float[3]; float phraseParam = 0; ParameterLocation curParams = startingPoint; // // Make steps of varying size from the current location // and find the best adjustment. // for (int g = 0; g < STEP_SIZE.length; g++) { for (int scoreSign = -1; scoreSign <= 1; scoreSign+=2) { // Compute our step size for "score" float scoreStep = scoreSign * STEP_SIZE[g]; // Now make each step in each of the 3 directions // from the current "score" settings. for (int i = 0; i < scoreParam.length; i++) { // 1. Restore the values for "score" curParams.getScoreParam(scoreParam); // 2. Adjust one step for (int j = 0; j < scoreParam.length; j++) { if (j == i) { scoreParam[j] += scoreStep; } else { scoreParam[j] -= (scoreStep / 2); } } // Make sure we adjust values within legal limits if (scoreParam[0] < 0 || scoreParam[0] > 1 || scoreParam[1] < 0 || scoreParam[1] > 1 || scoreParam[2] < 0 || scoreParam[2] > 1) { continue; } // // OK, now we do the same for "phrase" // for (int k = 0; k < STEP_SIZE.length; k++) { for (int phraseSign = -1; phraseSign <= 1; phraseSign+=2) { float phraseStep = phraseSign * STEP_SIZE[k]; // 1. Restore the value for phrase phraseParam = curParams.getPhraseParam(); // 2. Adjust one step phraseParam += ((phraseSign) * STEP_SIZE[k]); if (phraseParam < 0 || phraseParam > 1) { continue; } // // Finally, test Nutch here! // ParameterLocation newParams = new ParameterLocation(scoreParam, phraseParam); LOG.info("Now testing position: " + newParams); if (newParams.getScore() > curParams.getScore()) { curParams = newParams; } } } } } } return curParams; } /** * Start hillclimbing on parameters, tested on * the given set. */ public static void main(String argv[]) throws Exception { if (argv.length < 3) { System.out.println("Usage: java net.nutch.quality.SearchOptimizer <localInputsDir> <segmentsDir> <userAgent> [-seed <seed>]"); return; } File existingDir = new File(argv[0]); File segmentsDir = new File(argv[1]); String userAgent = argv[2]; long seed = new Random().nextLong(); if (argv.length > 3 && "-seed".equals(argv[3])) { try { seed = Long.parseLong(argv[4]); } catch (NumberFormatException nfe) { System.out.println("Seed is badly-formatted: " + argv[4]); return; } } SearchOptimizer so = new SearchOptimizer(segmentsDir, existingDir, userAgent, new Random()); so.optimizeParams(); } }